R/palimpsest_input.R

Defines functions count_triplets palimpsest_input

Documented in count_triplets palimpsest_input

#' palimpsest_input
#'
#' Creates the list of input matrices of the numbers and proportions of each mutation category (of one mutation type, e.g. SBS mutation categories) in each sample.
#' @param vcf The input VCF file from which a matrix of mutation categories is to be made.
#' @param mutypes leave blank to use default COSMIC categories (recommeneded), or add your own. 
#' @param Type Mutation type (SBS, DBS, ID or SV).
#' @keywords Signatures
#' @export
#' @examples
#' DBS_input <- palimpsest_input(vcf=vcf, Type = "DBS")



palimpsest_input <- function(vcf = NULL, Type = NULL, mutypes = NA){
  
  if(Type == "SBS") mutcat.col <- "SBS_cat3"; if(Type == "DBS") mutcat.col <- "DBS_cat"; if(Type == "ID") mutcat.col <- "ID_cat"; if(Type == "SV") mutcat.col <-"Category1"
  if(mutcat.col %!in% colnames(vcf)) stop(paste("vcf is missing the",Type,"mutation category column. Use the << annotate_VCF >> function to add the appropriate column."))
  
  ordering <- unique(vcf$Sample)
  if(is.na(mutypes)){
     if(Type == "SBS"){
      bases <- c("A", "C", "G", "T")
      ctxt16 <- paste(rep(bases, each = 4), rep(bases, 4), 
                      sep = ".")
      mt <- c("CA", "CG", "CT", "TA", "TC", "TG")
      Types96 <- paste(rep(mt, each = 16), rep(ctxt16, 6), 
                       sep = "_")
      mutypes <- paste0(substr(Types96,1,4), substr(Types96,1,1), substr(Types96,6,6))
      
    }
    if(Type == "DBS"){
      mutypes <- c("AC>CA", "AC>CG", "AC>CT", "AC>GA", "AC>GG", "AC>GT", "AC>TA", "AC>TG", "AC>TT", "AT>CA", "AT>CC", "AT>CG", "AT>GA", "AT>GC", "AT>TA", "CC>AA", "CC>AG",
                   "CC>AT", "CC>GA" ,"CC>GG" ,"CC>GT", "CC>TA", "CC>TG", "CC>TT", "CG>AA", "CG>AC", "CG>AT", "CG>GA" ,"CG>GC" ,"CG>TA", "CT>AA", "CT>AC", "CT>AG", "CT>GA",
                   "CT>GC" ,"CT>GG" ,"CT>TA" ,"CT>TC", "CT>TG", "GC>AA", "GC>AG", "GC>AT" ,"GC>CA" ,"GC>CG" ,"GC>TA" ,"TA>AC", "TA>AG", "TA>AT", "TA>CC", "TA>CG", "TA>GC",
                   "TC>AA", "TC>AG", "TC>AT", "TC>CA", "TC>CG", "TC>CT", "TC>GA", "TC>GG", "TC>GT", "TG>AA", "TG>AC", "TG>AT", "TG>CA", "TG>CC", "TG>CT", "TG>GA", "TG>GC",
                   "TG>GT", "TT>AA", "TT>AC", "TT>AG", "TT>CA", "TT>CC", "TT>CG", "TT>GA", "TT>GC", "TT>GG")
    }
    if(Type == "ID"){
      mutypes = c("DEL_C_1_0","DEL_C_1_1","DEL_C_1_2","DEL_C_1_3","DEL_C_1_4","DEL_C_1_5+", "DEL_T_1_0","DEL_T_1_1","DEL_T_1_2",
                  "DEL_T_1_3","DEL_T_1_4","DEL_T_1_5+", "INS_C_1_0","INS_C_1_1","INS_C_1_2","INS_C_1_3","INS_C_1_4","INS_C_1_5+", 
                  "INS_T_1_0","INS_T_1_1","INS_T_1_2","INS_T_1_3" ,"INS_T_1_4","INS_T_1_5+","DEL_repeats_2_0","DEL_repeats_2_1",
                  "DEL_repeats_2_2","DEL_repeats_2_3","DEL_repeats_2_4","DEL_repeats_2_5+","DEL_repeats_3_0",
                  "DEL_repeats_3_1","DEL_repeats_3_2","DEL_repeats_3_3","DEL_repeats_3_4","DEL_repeats_3_5+", 
                  "DEL_repeats_4_0","DEL_repeats_4_1","DEL_repeats_4_2","DEL_repeats_4_3","DEL_repeats_4_4","DEL_repeats_4_5+",
                  "DEL_repeats_5+_0","DEL_repeats_5+_1","DEL_repeats_5+_2","DEL_repeats_5+_3","DEL_repeats_5+_4","DEL_repeats_5+_5+",
                  "INS_repeats_2_0","INS_repeats_2_1","INS_repeats_2_2","INS_repeats_2_3","INS_repeats_2_4","INS_repeats_2_5+" ,
                  "INS_repeats_3_0","INS_repeats_3_1","INS_repeats_3_2","INS_repeats_3_3","INS_repeats_3_4","INS_repeats_3_5+" ,
                  "INS_repeats_4_0","INS_repeats_4_1","INS_repeats_4_2","INS_repeats_4_3","INS_repeats_4_4","INS_repeats_4_5+", 
                  "INS_repeats_5+_0","INS_repeats_5+_1","INS_repeats_5+_2","INS_repeats_5+_3","INS_repeats_5+_4","INS_repeats_5+_5+",
                  "DEL_MH_2_1","DEL_MH_3_1","DEL_MH_3_2","DEL_MH_4_1","DEL_MH_4_2","DEL_MH_4_3",      
                  "DEL_MH_5+_1","DEL_MH_5+_2","DEL_MH_5+_3","DEL_MH_5+_4","DEL_MH_5+_5+")
        mutypes <- gsub("_",",",mutypes)
      
    }
  }
  if(Type == "SV"){
    tt <- as.matrix(table(vcf[, mutcat.col], vcf$Sample))
    mat <- matrix(as.numeric(tt), nrow = nrow(tt), ncol = ncol(tt))
    rownames(mat) <- rownames(tt)
    colnames(mat) <- colnames(tt)
    data <- mat
    categs <- c("DEL_0-1kb_clust_1", "DEL_1-10kb_clust_1", 
                "DEL_10-100kb_clust_1", "DEL_100kb-1Mb_clust_1", 
                "DEL_1-10Mb_clust_1", "DEL_>10Mb_clust_1", "DUP_0-1kb_clust_1", 
                "DUP_1-10kb_clust_1", "DUP_10-100kb_clust_1", "DUP_100kb-1Mb_clust_1", 
                "DUP_1-10Mb_clust_1", "DUP_>10Mb_clust_1", "INV_0-1kb_clust_1", 
                "INV_1-10kb_clust_1", "INV_10-100kb_clust_1", "INV_100kb-1Mb_clust_1", 
                "INV_1-10Mb_clust_1", "INV_>10Mb_clust_1", "BND_clust_1", 
                "DEL_0-1kb_clust_0", "DEL_1-10kb_clust_0", "DEL_10-100kb_clust_0", 
                "DEL_100kb-1Mb_clust_0", "DEL_1-10Mb_clust_0", "DEL_>10Mb_clust_0", 
                "DUP_0-1kb_clust_0", "DUP_1-10kb_clust_0", "DUP_10-100kb_clust_0", 
                "DUP_100kb-1Mb_clust_0", "DUP_1-10Mb_clust_0", "DUP_>10Mb_clust_0", 
                "INV_0-1kb_clust_0", "INV_1-10kb_clust_0", "INV_10-100kb_clust_0", 
                "INV_100kb-1Mb_clust_0", "INV_1-10Mb_clust_0", "INV_>10Mb_clust_0", 
                "BND_clust_0")
    missings_cat = setdiff(categs, rownames(data))
    for (i in missings_cat) {
      data = rbind(data, assign(i, rep(0, ncol(data))))
      rownames(data)[nrow(data)] = i
    }
   
      props <- prop.table(data, 2)
      res = list(mut_nums = data, mut_props = props)
    return(res)
  }else{
    tmp <- split(vcf, vcf$Sample)
    truenames <- names(tmp)
    tmp <- lapply(tmp, function(d) {
      sapply(mutypes, function(m) {
        sum(d[, mutcat.col] == m, na.rm = T)
      })
    })
    names(tmp) <- truenames; tmp <- tmp[ordering]
    nums <- as.matrix(as.data.frame(tmp))
    
    if(Type == "DBS"){
      trip_nums <- count_triplets(vcf, ordering, mutypes)
      
      nums <- nums - trip_nums
      nums <- nums/2
      nums <- nums+trip_nums
    }
    
    colnames(nums) <- ordering; rownames(nums) <- mutypes
    nums <- as.matrix(nums[,(colSums(nums)>0)])
    props <- nums
    for (j in 1:ncol(props)) props[, j] <- props[, j]/sum(props[,j])
    
    res = list(mut_nums = as.matrix(nums), mut_props = as.matrix(props))
    return(res)
  }
}








#' count_triplets
#'
#' Counts the number of DBS mutations that are in fact triple base substituions. Used inside "palimpsest_input2", not designed to be used outside this function.
#' @param vcf The input vcf file.
#' @param ordering the ordering of samples from palimpsest_input2 function.
#' @param mutypes The DBS mutypes from palimpsest_input2 function.
#' @keywords Signatures
#' @export
#' @examples
#' trip_nums <- count_triplets(vcf, ordering, mutypes)



count_triplets <- function(vcf, ordering, mutypes){
  
  res <- as.data.frame(matrix(ncol = 4,nrow = nrow(vcf)))
  lizt <- c()
  for(i in 1:nrow(vcf)){
    lizt[i] <- vcf$POS[i]
    if(i>2){  
      if(lizt[i]==(lizt[i-1])+1){
        if(lizt[i]==(lizt[i-2])+2){
          res[i,] <- vcf[i,c("Sample","CHROM","POS","DBS_cat")]
          res[i-1,] <- vcf[i-1,c("Sample","CHROM","POS","DBS_cat")]
          res[i-2,] <- vcf[i-2,c("Sample","CHROM","POS","DBS_cat")]
        }
      }
    }
  }
  colnames(res) <- c("Sample","CHROM","POS","DBS_cat")
  res <- res[-which(is.na(res[,1])),]
  newres <- as.data.frame(matrix(ncol = 4,nrow = nrow(res)));   colnames(newres) <- c("Sample","CHROM","POS","DBS_cat")
  for(i in 1:(nrow(res)/3)){
    tmp <- res[((3*i)-2):(3*i),]
    if(length(unique(tmp$CHROM))==1) newres[((3*i)-2):(3*i),] <- tmp
  }
  res <- newres[which(!is.na(newres$Sample)),]
  if(nrow(res)>0){
    res$rm <- NA
    for(i in 2:(nrow(res)-1)){
      if(res$DBS_cat[i] == res$DBS_cat[i-1] & res$Sample[i]==res$Sample[i-1] &  res$CHROM[i]==res$CHROM[i-1]){
        if(i==2){ 
          res$rm[i] <- "rm"
          res$rm[i-1] <- "rm"
        }else{
          if(res$DBS_cat[i] != res$DBS_cat[i-2] & res$POS[i] != (res$POS[i-2])+2){ 
            res$rm[i] <- "rm"
            res$rm[i-1] <- "rm"
          }
          if(res$DBS_cat[i] == res$DBS_cat[i-1] & 
             res$POS[i] == (res$POS[i-1])+1 & 
             res$POS[i] == (res$POS[i+1])-1 & 
             res$Sample[i] == res$Sample[i+1] & 
             res$Sample[i-1] == res$Sample[i]){ 
            res$rm[i] <- "rm"
            res$rm[i-1] <- "rm"
          }
        }
      }
    }
    
    
    res <- res[-which(res$rm == "rm"),] %>% 
      dplyr::select(Sample, DBS_cat)
    
    
    tmp <- split(res, res$Sample)
    tripnames <- names(tmp)
    tmp <- lapply(tmp, function(d) {
      sapply(mutypes, function(m) {
        sum(d[, "DBS_cat"] == m, na.rm = T)
      })
    })
    names(tmp) <- tripnames
    
    trip_nums <- as.data.frame(tmp)
    for(i in 1:length(ordering)){
      if(ordering[i] %!in% colnames(trip_nums)){
        new_col_num <- ncol(trip_nums)+1
        trip_nums[,new_col_num] <- 0; colnames(trip_nums)[new_col_num] <- ordering[i]
      }
    }
    trip_nums <- trip_nums[,ordering]
  }else{
    trip_nums <- as.data.frame(matrix(ncol = length(ordering),nrow = length(mutypes)))
    colnames(trip_nums) <- ordering; rownames(trip_nums) <- mutypes
    trip_nums[] <- 0
  }
  return(trip_nums)
}






#' makeMutypeMatFromVcf
#'
#' Function to create a matrix in mutation type x sample format with either counts or proportions
#' @param vcf vcf data frame containing the mutations/SVs
#' @param sample.col sample.col
#' @param mutcat.col mutcat.col
#' @param mutypes mutypes
#' @param proportion proportion
#'
#' @export


makeMutypeMatFromVcf <- function (vcf,
                                  sample.col = "Sample",
                                  mutcat.col = "SBS_cat3",
                                  mutypes = c("CA", "CG", "CT", "TA", "TC", "TG"),
                                  proportion = TRUE)
{
  tmp <- split(vcf, vcf[, sample.col])
  tmp <- lapply(tmp, function(d) {
    sapply(mutypes, function(m) {
      sum(d[, mutcat.col] == m, na.rm = T)
    })
  })
  tmp <- as.matrix(as.data.frame(tmp))
  if (proportion) {
    for (j in 1:ncol(tmp)) tmp[,j] <- tmp[, j]/sum(tmp[,j])
  }
  tmp
}
FunGeST/Palimpsest documentation built on June 2, 2024, 4:21 a.m.